import sys
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["pdf.fonttype"] = 42
sc.settings.set_figure_params(dpi=120)
sns.set_style("dark")
from sklearn.svm import SVR
def filter_cv_vs_mean(S: np.ndarray, N: int, svr_gamma: float=None, plot: bool=True, min_expr_cells: int=2,
max_expr_avg: float=20, min_expr_avg: float=0) -> np.ndarray:
muS = S.mean(1)
detected_bool = ((S > 0).sum(1) > min_expr_cells) & (muS < max_expr_avg) & (muS > min_expr_avg)
Sf = S[detected_bool, :]
mu = Sf.mean(1)
sigma = Sf.std(1, ddof=1)
cv = sigma / mu
log_m = np.log2(mu)
log_cv = np.log2(cv)
if svr_gamma is None:
svr_gamma = 150. / len(mu)
svr = SVR(gamma=svr_gamma)
svr.fit(log_m[:, None], log_cv)
fitted_fun = svr.predict
ff = fitted_fun(log_m[:, None])
score = log_cv - ff
xnew = np.linspace(np.min(log_m), np.max(log_m))
ynew = svr.predict(xnew[:, None])
nth_score = np.sort(score)[::-1][N]
if plot:
plt.scatter(log_m[score > nth_score], log_cv[score > nth_score], s=3, alpha=0.4, c="tab:red")
plt.scatter(log_m[score <= nth_score], log_cv[score <= nth_score], s=3, alpha=0.4, c="tab:blue")
mu_linspace = np.linspace(np.min(log_m), np.max(log_m))
plt.plot(mu_linspace, fitted_fun(mu_linspace[:, None]), c="k")
plt.xlabel("log2 mean S")
plt.ylabel("log2 CV S")
cv_mean_score = np.zeros(detected_bool.shape)
cv_mean_score[~detected_bool] = np.min(score) - 1e-16
cv_mean_score[detected_bool] = score
cv_mean_selected = cv_mean_score >= nth_score
return cv_mean_selected
adata1 = sc.read_h5ad("D30_Sorted_Joint_Analysis.h5ad")
original_data = sc.read_loom("hESC-RPE_invitro_differentiation.loom")
original_data
adata2 = original_data[original_data.obs["Day"]=="D30"].copy()
adata2
adata2.obs["cell_type"] = adata2.obs["CellTypeCategory2"]
adata2.obs["sample"] = "Unsorted"
adata = adata2.concatenate(adata1)
adata.obs["batch"].value_counts()
from collections import Counter
Counter(adata.obs["sample"])
adata
adata_raw = adata.copy()
sc.pp.filter_genes(adata, min_cells=20)
sc.pp.normalize_total(adata)
adata.raw = adata
S_genes_hum = ["MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2",
"MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1", "CENPU", "HELLS", "RFC2",
"RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76", "SLBP", "CCNE2", "UBR7",
"POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN",
"DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"]
G2M_genes_hum = ["HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
"CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "PIMREG",
"SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E",
"TUBB4B", "GTSE1", "KIF20B", "HJURP", "CDCA3", "JPT1", "CDC20", "TTK",
"CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2",
"KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE",
"CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA"]
sc.tl.score_genes_cell_cycle(adata, s_genes=S_genes_hum, g2m_genes=G2M_genes_hum)
cv_vs_mean_keep = filter_cv_vs_mean(adata.X.toarray().T, N=1000, max_expr_avg=50)
sc.pp.log1p(adata)
adata = adata[:, cv_vs_mean_keep].copy()
sc.pp.regress_out(adata, ['S_score'])
sc.pp.regress_out(adata, ['G2M_score'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['PDGFRB', 'NCAM1', 'sample'])
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=5)
sc.tl.umap(adata, alpha=0.3, min_dist=0.5, random_state=0)
sc.tl.louvain(adata, resolution=1)
sc.pl.umap(adata, use_raw=True, color=['louvain', "phase", 'sample'], ncols=3)
adata_raw.obsm["X_umap"] = adata.obsm["X_umap"]
adata_raw.obs["louvain"] = adata.obs["louvain"]
adata_raw_norm = adata_raw.copy()
sc.pp.normalize_total(adata_raw_norm)
sc.pp.log1p(adata_raw_norm)
sc.pl.umap(adata_raw_norm, use_raw=False, color=["NCAM1", "PDGFRB"], ncols=4)
sc.pl.umap(adata_raw_norm, use_raw=False, color=["MITF", "TYRP1", "SERPINF1", "DCT", "SFRP5",
"RLBP1", "BEST1", "RPE65", "TTR", "sample"], ncols=5)
adata_integrate_init = adata_raw.copy()
sc.pp.filter_genes(adata_integrate_init, min_cells=20)
adata_integrate_init = adata_integrate_init[:, cv_vs_mean_keep].copy()
sc.pp.normalize_total(adata_integrate_init)
sc.pp.log1p(adata_integrate_init)
b0 = adata_integrate_init[adata_integrate_init.obs["sample"]=="Unsorted"].copy()
b1 = adata_integrate_init[adata_integrate_init.obs["sample"]!="Unsorted"].copy()
b0
b1
from miscalg import integration as seurat
b0_dict = {}
b1_dict = {}
b0_dict["normalized_array"] = np.array(b0.X.toarray())
b1_dict["normalized_array"] = np.array(b1.X.toarray())
b0_dict["GeneID"] = np.array(b0.var.index)
b1_dict["GeneID"] = np.array(b1.var.index)
anchor_pairs, anchor_scores, indicators_pairs = seurat.find_integration_anchors(
object_list = [b0_dict, b1_dict],
dims=15,
anchor_features=b0.shape[1],
normalization_method=None,
dim_reduction="crossSVD")
b0_integrated, b1_integrated = seurat.run_integration(data_matrices = (b0_dict["normalized_array"],
b1_dict["normalized_array"]),
anchor_pairs=anchor_pairs,
anchor_score = anchor_scores)
adata_integrated = sc.AnnData(b0_integrated).concatenate(sc.AnnData(b1_integrated))
adata_integrated.obs["S_score"] = [i for i in adata.obs["S_score"]]
adata_integrated.obs["G2M_score"] = [i for i in adata.obs["G2M_score"]]
sc.pp.regress_out(adata_integrated, ['S_score'])
sc.pp.regress_out(adata_integrated, ['G2M_score'])
sc.tl.pca(adata_integrated, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_integrated, log=True)
adata_integrated.obs["sample"] = [i for i in adata.obs["sample"]]
sc.pp.neighbors(adata_integrated, n_neighbors=30, n_pcs=5)
sc.tl.umap(adata_integrated, alpha=0.3, min_dist=0.5)
sc.tl.louvain(adata_integrated, resolution=1)
adata_integrated.obs["sample"] = [i for i in adata.obs["sample"]]
sc.pl.umap(adata_integrated, use_raw=False, color=['louvain', 'batch', 'sample'], ncols=4)
adata_integrated.obs.index = adata_integrate_init.obs.index
adata_integrated.var.index = adata_integrate_init.var.index
n2c = {"Unsorted":"grey", "NCAM1-High":"#ff40ff", "CD140b-High":"#0432ff"}
adata_raw_norm.obsm["X_tsne"] = adata_integrated.obsm["X_umap"]
adata_raw_norm.obs["integrated_louvain"] = [i for i in adata_integrated.obs["louvain"]]
tsne = adata_raw_norm.obsm["X_tsne"]
plt.scatter(tsne[:, 0], tsne[:, 1], c=[n2c[i] for i in adata_integrated.obs["sample"]], s=5, alpha=0.5, rasterized=True)
plt.axis("off")
plt.show()
sc.pl.tsne(adata_raw_norm, color=['PDGFRB', 'NCAM1', 'sample'])
em = adata_raw_norm.obsm["X_tsne"]
plt.scatter(em[:, 0], em[:, 1], s=5)
plt.axvline(11.25)
plt.axhline(9.4)
nl = []
for x,y,l in zip(em[:, 0], em[:, 1], adata_raw_norm.obs["integrated_louvain"]):
if y<2.5 and l == "7":
nl.append("13")
elif x<11.25 and y>9.4 and l=="7":
nl.append("12")
elif x>3 and y<2.5 and l in ["10", "4", "6"]:
nl.append("13")
else:
nl.append(l)
adata_raw_norm.obs["integrated_louvain"] = nl
sc.pl.tsne(adata_raw_norm, use_raw=False, color=['integrated_louvain', 'sample'], ncols=4)
louvain2name = {'0':"MidRPE", '1': "EarlyRPE", '10':"NeuralCrest", '2': "LateRPE", '3':"MidRPE",
'4':"EarlyRPE", '5':"MidRPE", '6':"RetProg", '7':"RetProg", '8': "EMT-RPE", '9':"EMT-RPE",
"12":"NeuralRet", "13":"Neuronal"}
adata_raw_norm.obs["cell_type_new"] = [louvain2name[i] for i in adata_raw_norm.obs["integrated_louvain"]]
sc.pl.tsne(adata_raw_norm, use_raw=False, color=['cell_type_new', 'sample', "POU5F1"], wspace=0.45, ncols=4)
n2n = {'EMT-RPE':'#1f77b4',
'EarlyRPE': '#ff7f0e',
'LateRPE': '#279e68',
'MidRPE': '#d62728',
'NeuralCrest': '#aa40fc',
'NeuralRet': '#8c564b',
'Neuronal':'#e377c2',
'RetProg':'#b5bd61'}
plt.scatter(tsne[:, 0], tsne[:, 1], c=[n2n[i] for i in adata_raw_norm.obs["cell_type_new"]],
s=5, alpha=0.5, rasterized=False)
plt.axis("off")
plt.show()
n2n = {'EMT-RPE':0,
'EarlyRPE': 1,
'LateRPE': 2,
'MidRPE': 3,
'NeuralCrest': 4,
'NeuralRet': 5,
'Neuronal':6,
'RetProg':7}
from miscalg import enrichment
Xgen = adata_raw_norm.X.toarray().T
enrichment_score = enrichment.enrichment_score(Xgen, np.array([n2n[i] for i in adata_raw_norm.obs["cell_type_new"]]))
emarkers = enrichment.extract_enriched(enrichment_score,
np.array(adata_raw_norm.var.index), n_enriched=30)
edf = pd.DataFrame(emarkers)
edf.head(20)
genes = ["ATOH7", "POU4F2", "NEUROD1", "STMN2",
"RAX", "VSX2", "SFRP2", "CRB1",
"MITF", "TYRP1", "PMEL", "DCT",
"TYR", "RLBP1", "BEST1", "RPE65",
"EMX2", "ARX", "RSPO3", "FGF17",
"WNT6", "DLX5", "TFAP2C", "S100A14",
"ACTA2", "TAGLN", "GJA3", "WNT2B"
]
X = pd.DataFrame(adata_raw_norm[:, genes].X.toarray())
X.index = adata_raw_norm.obs["cell_type_new"]
X.columns = genes
X = X.groupby(X.index).mean()
X = X.loc[["NeuralRet", "RetProg", "EarlyRPE", "MidRPE", "LateRPE", "Neuronal", "NeuralCrest", "EMT-RPE"]]
plt.figure(None, (16, 4))
sns.heatmap((X - X.min()) / (X.max() - X.min()), cmap='inferno')
plt.show()
df = adata_raw_norm.obs
df.groupby(["cell_type_new", "sample"])["sample"].value_counts()/df.groupby(["sample"])["sample"].value_counts()*100
yticks = ["NeuralRet", "RetProg", "EarlyRPE", "MidRPE", "LateRPE", "Neuronal", "NeuralCrest", "EMT-RPE"]
xticks = ["NCAM1-High", "CD140b-High", "Unsorted"]
x = [0]*8 + [1]*8 + [2]*8
y = [1, 2, 3, 4, 5, 6, 7, 8] * 3
s = [2.179837, 64.713896, 20.572207, 2.588556, 0, 4.495913, 3.269755, 2.179837]
s += [0.134590, 0.942127, 27.927322, 42.866756, 17.563930, 0.740242, 0.874832, 8.950202]
s += [0.215983, 7.451404, 23.056156, 41.576674, 11.609071, 0.701944, 2.915767, 12.473002]
c = ['#8c564b', '#b5bd61', '#ff7f0e', '#d62728', '#279e68', '#e377c2', '#aa40fc', '#1f77b4'] * 3
sns.set_style("white")
plt.scatter(x[::-1]+[3]*6, y+[1, 2, 3, 4, 5, 6], s=np.array(s[::-1]+[1, 10, 25, 50, 75, 100])*5, c=c+["black"]*6)
plt.xticks(range(0, 3), xticks, rotation=90)
plt.yticks(range(1, 9), yticks[::-1])
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.xlim(-1, 5)
plt.show()
# legend
x = [0]*6
y = [1, 2, 3, 4, 5, 6]
s = [1, 10, 25, 50, 75, 100]
sns.set_style("white")
plt.scatter(x[::-1], y, s=np.array(s)*5, c='black')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.ylim(0, 7)
plt.axis("off")
plt.show()
a_tmp = adata_raw_norm[adata_raw_norm.obs["sample"].isin(["NCAM1-High", "CD140b-High"])]
markers = ["SFRP5", "TTR", "SLC35D3", "TYR", "RLBP1",
"FEZF2", "CRB1", "SOX2", "FGF9", "VSX2", "NCAM1", "PDGFRB"]
sc.pl.dotplot(a_tmp, markers, groupby='sample', dendrogram=False)
adata_raw_norm
adata_raw.write_h5ad("D30_Sorted_Integrated.h5ad")
adata_raw.obsm["X_umap"] = adata_raw_norm.obsm["X_tsne"]